Contents

Altair

1
2
import altair as alt
from vega_datasets import data

Incomplete notes based on this awesome tutorial by Jake VanderPlas, which I followed up to 1:30:00. To be continued if I start using Altair more in the future.

Vega datasets

1
2
3
# List available datasets

data.list_datasets()[:10]
['7zip',
 'airports',
 'annual-precip',
 'anscombe',
 'barley',
 'birdstrikes',
 'budget',
 'budgets',
 'burtin',
 'cars']
1
2
3
# Get dataset details

data.airports.description
'This dataset lists US airports, including airport code, city, state, latitude, and longitude. This dataset is a subset of the data compiled and published at http://ourairports.com/data/, and is in the public domain.'

Quick overview

This and following sections are based on this tutorial.

1
2
cars = data.cars()
cars.head(2)

Name Miles_per_Gallon Cylinders Displacement Horsepower Weight_in_lbs Acceleration Year Origin
0 chevrolet chevelle malibu 18.0 8 307.0 130.0 3504 12.0 1970-01-01 USA
1 buick skylark 320 15.0 8 350.0 165.0 3693 11.5 1970-01-01 USA
1
2
3
# One point for each car plotted on top of each other

alt.Chart(cars).mark_point()

1
2
3
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon'
)

1
2
3
alt.Chart(cars).mark_line().encode(
    x='Miles_per_Gallon'
)

1
2
3
alt.Chart(cars).mark_tick().encode(
    x='Miles_per_Gallon'
)

1
2
3
4
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower'
)

1
2
3
4
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower'
).interactive()

1
2
3
alt.Chart(cars).mark_tick().encode(
    x=alt.X('Miles_per_Gallon', bin=True)
)

1
2
3
4
5
# Get a histogram (boom!)
alt.Chart(cars).mark_bar().encode(
    x=alt.X('Miles_per_Gallon', bin=True),
    y='count()'
)

1
2
3
4
5
6
7
# Get a 2D histogram (boooooom!)
# This is the power of declarative grammar right there!
alt.Chart(cars).mark_bar().encode(
    x=alt.X('Miles_per_Gallon', bin=True),
    y=alt.Y('Horsepower', bin=True),
    color='count()'
)

Leveraging the grammar of interaction

1
2
3
4
5
6
7
8
9
# Interval (brush) selection: drag on the chart to select a region.
interval = alt.selection_interval()

# Attach the selection with add_selection(), the same API the layered
# example further down uses, instead of the legacy properties(selection=...).
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color='Origin'
).add_selection(
    interval
)

1
2
3
4
5
6
7
8
9
# Restrict the brush to the x encoding only.
interval = alt.selection_interval(encodings=['x'])

# Attach the selection with add_selection() rather than the legacy
# properties(selection=...) form.
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color='Origin'
).add_selection(
    interval
)

1
2
3
4
5
6
7
8
9
# Brush over both the x and y encodings.
interval = alt.selection_interval(encodings=['x', 'y'])

# Points inside the brush keep their Origin colour; all others turn light gray.
# The selection is attached with add_selection() rather than the legacy
# properties(selection=...) form.
alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color=alt.condition(interval, 'Origin', alt.value('lightgray'))
).add_selection(
    interval
)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
# Two linked scatter plots side by side: brush along the x-axis of one chart
# and the same cars are highlighted in the other.

interval = alt.selection_interval(encodings=['x'])

# Attach the selection with add_selection() rather than the legacy
# properties(selection=...) form.
chart = alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color=alt.condition(interval, 'Origin', alt.value('lightgray')),
    tooltip='Name'
).add_selection(
    interval
)

# `|` concatenates horizontally; the right-hand copy swaps x to Acceleration.
chart | chart.encode(x='Acceleration')

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
# Brushable scatter plot stacked above a bar chart of car counts per origin.
# The bar chart is not filtered by the brush here.

interval = alt.selection_interval(encodings=['x'])

# Attach the selection with add_selection() rather than the legacy
# properties(selection=...) form.
chart = alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color=alt.condition(interval, 'Origin', alt.value('lightgray')),
    tooltip='Name'
).add_selection(
    interval
)

hist = alt.Chart(cars).mark_bar().encode(
    x='count()',
    y='Origin',
    color='Origin'
)

# `&` concatenates vertically.
chart & hist

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Brushable scatter plot stacked above a bar chart of car counts per origin.
# The bar chart is filtered by the brush, so it counts only the selected cars.

interval = alt.selection_interval(encodings=['x'])

# Attach the selection with add_selection() rather than the legacy
# properties(selection=...) form.
chart = alt.Chart(cars).mark_point().encode(
    x='Miles_per_Gallon',
    y='Horsepower',
    color=alt.condition(interval, 'Origin', alt.value('lightgray')),
    tooltip='Name'
).add_selection(
    interval
)

# transform_filter(interval) keeps only the rows inside the brush.
hist = alt.Chart(cars).mark_bar().encode(
    x='count()',
    y='Origin',
    color='Origin'
).transform_filter(
    interval
)

# `&` concatenates vertically.
chart & hist

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import altair as alt
from vega_datasets import data

source = data.cars()

# Configure the options common to all layers
brush = alt.selection(type='interval')
base = alt.Chart(source).add_selection(brush)

# Configure the points
points = base.mark_point().encode(
    x=alt.X('Miles_per_Gallon', title=''),
    y=alt.Y('Horsepower', title=''),
    color=alt.condition(brush, 'Origin', alt.value('grey'))
)

# Configure the ticks
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)

x_ticks = base.mark_tick().encode(
    alt.X('Miles_per_Gallon', axis=tick_axis),
    alt.Y('Origin', title='', axis=tick_axis),
    color=alt.condition(brush, 'Origin', alt.value('lightgrey'))
)

y_ticks = base.mark_tick().encode(
    alt.X('Origin', title='', axis=tick_axis),
    alt.Y('Horsepower', axis=tick_axis),
    color=alt.condition(brush, 'Origin', alt.value('lightgrey'))
)

# Build the chart
y_ticks | (points & x_ticks)

Core concepts of simple charts

Three elements of a simple chart:

  • Data, marks, and encodings: the three core pieces of an Altair plot.
  • Encoding types (Q, N, O, T), which drive the visual representation of encodings.
  • Binning and aggregation, which control aspects of data representation.
1
2
import altair as alt
from vega_datasets import data
1
cars = data.cars()

Marks

  • Many different marks available
  • Use tab completion on alt.Chart.mark to see them all

Encodings

1
2
3
alt.Chart(cars).mark_point().encode(
    x='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    y='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    color='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    opacity='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    shape='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    size='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    row='Origin'
)

1
2
3
alt.Chart(cars).mark_point().encode(
    column='Origin'
)

1
2
3
4
5
6
7
# Stacked bar graph

alt.Chart(cars).mark_bar().encode(
    y='Origin',
    x='count()',
    color='Cylinders'
)

1
2
3
4
5
6
alt.Chart(cars).mark_point().encode(
    x='Displacement', 
    y='Horsepower',
    shape='Origin',
    color='Origin'
)

Datatypes

Datatypes are inferred, but explicitly thinking about and setting them is worth it.

1
2
3
4
5
alt.Chart(cars).mark_tick().encode(
    x='Miles_per_Gallon:Q',
    y='Origin:N',
    color='Cylinders'
)

1
2
3
4
5
alt.Chart(cars).mark_tick().encode(
    x='Miles_per_Gallon:Q',
    y='Origin:N',
    color='Cylinders:O'
)

Binning and aggregation

1
2
3
4
import altair as alt
from vega_datasets import data

cars = data.cars()

Aggregation functions implement the split-apply-combine sequence, just like pandas' groupby().

1
cars.groupby('Origin')['Miles_per_Gallon'].mean()
Origin
Europe    27.891429
Japan     30.450633
USA       20.083534
Name: Miles_per_Gallon, dtype: float64
1
2
3
4
alt.Chart(cars).mark_bar().encode(
    x='mean(Miles_per_Gallon)',
    y='Origin:N'
)

We can easily group data by Origin and cylinders and apply the same mean aggregation:

1
2
3
4
5
6
alt.Chart(cars).mark_bar().encode(
    x='mean(Miles_per_Gallon)',
    y='Cylinders:O',
    row='Origin', 
    color='Origin'
)

1
2
3
4
5
alt.Chart(cars).mark_bar().encode(
    alt.X('Miles_per_Gallon', bin=True),
    alt.Y('count()'),
    alt.Color('Origin')
)

1
2
3
4
5
# Count of cars per origin, with each bar split by colour into mpg bins.
alt.Chart(cars).mark_bar().encode(
    alt.X('count()'),
    alt.Y('Origin'),
    # The field name must match the DataFrame column exactly
    # (was misspelt 'Miles_per_Galloan').
    alt.Color('Miles_per_Gallon:N', bin=True)
)

1
2
3
4
5
alt.Chart(cars).mark_bar().encode(
    alt.Y('Origin'),
    alt.X('Miles_per_Gallon', bin=alt.Bin(maxbins=20)),
    alt.Color('count()')
)

Iris

1
2
iris = data.iris()
iris.head()

sepalLength sepalWidth petalLength petalWidth species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
1
iris.species.unique()
array(['setosa', 'versicolor', 'virginica'], dtype=object)
1
2
3
4
5
6
7
8
alt.Chart(iris).mark_circle().encode(
    x='sepalWidth', 
    y='petalWidth',
    color='species', 
).properties(
    width=300, 
    height=300
)

1
2
3
4
5
6
7
8
9
iris = data.iris()

chart = alt.Chart(iris).mark_point().encode(
    x='petalLength',
    y='sepalWidth',
    color='species'
)

chart | chart.encode(x='sepalLength') | chart.encode(y='petalWidth')